R Markdown

# Read csv from project dir
 life_df <- read_csv("C:/Users/e005108/Downloads/datasets_12603_17232_Life Expectancy Data.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Country = col_character(),
##   Status = col_character()
## )
## See spec(...) for full column specifications.
 #life_df <- read_csv(file.choose())  
#life_df <- read_csv("data/LifeExpectancyData_w_regions.csv")
life_df <- 
  life_df %>%
  filter(Year == 2014)
head(life_df)
## # A tibble: 6 x 22
##   Country  Year Status `Life expectanc~ `Adult Mortalit~ `infant deaths` Alcohol
##   <chr>   <dbl> <chr>             <dbl>            <dbl>           <dbl>   <dbl>
## 1 Afghan~  2014 Devel~             59.9              271              64    0.01
## 2 Albania  2014 Devel~             77.5                8               0    4.51
## 3 Algeria  2014 Devel~             75.4               11              21    0.01
## 4 Angola   2014 Devel~             51.7              348              67    8.33
## 5 Antigu~  2014 Devel~             76.2              131               0    8.56
## 6 Argent~  2014 Devel~             76.2              118               8    7.93
## # ... with 15 more variables: `percentage expenditure` <dbl>, `Hepatitis
## #   B` <dbl>, Measles <dbl>, BMI <dbl>, `under-five deaths` <dbl>, Polio <dbl>,
## #   `Total expenditure` <dbl>, Diphtheria <dbl>, `HIV/AIDS` <dbl>, GDP <dbl>,
## #   Population <dbl>, `thinness 1-19 years` <dbl>, `thinness 5-9 years` <dbl>,
## #   `Income composition of resources` <dbl>, Schooling <dbl>
for (i in 1:length(names(life_df))) {
  ifelse(
    grep(" ",names(life_df)[i]) == TRUE,
    names(life_df)[i] <- gsub(" ", "_", names(life_df)[i]),
    next
  )
}
names(life_df)
##  [1] "Country"                         "Year"                           
##  [3] "Status"                          "Life_expectancy"                
##  [5] "Adult_Mortality"                 "infant_deaths"                  
##  [7] "Alcohol"                         "percentage_expenditure"         
##  [9] "Hepatitis_B"                     "Measles"                        
## [11] "BMI"                             "under-five_deaths"              
## [13] "Polio"                           "Total_expenditure"              
## [15] "Diphtheria"                      "HIV/AIDS"                       
## [17] "GDP"                             "Population"                     
## [19] "thinness__1-19_years"            "thinness_5-9_years"             
## [21] "Income_composition_of_resources" "Schooling"
summary(life_df)
##    Country               Year         Status          Life_expectancy
##  Length:183         Min.   :2014   Length:183         Min.   :48.10  
##  Class :character   1st Qu.:2014   Class :character   1st Qu.:65.60  
##  Mode  :character   Median :2014   Mode  :character   Median :73.60  
##                     Mean   :2014                      Mean   :71.54  
##                     3rd Qu.:2014                      3rd Qu.:76.85  
##                     Max.   :2014                      Max.   :89.00  
##                                                                      
##  Adult_Mortality infant_deaths       Alcohol       percentage_expenditure
##  Min.   :  1.0   Min.   :  0.00   Min.   : 0.010   Min.   :    0.00      
##  1st Qu.: 66.0   1st Qu.:  0.00   1st Qu.: 0.010   1st Qu.:   11.06      
##  Median :135.0   Median :  2.00   Median : 0.320   Median :  151.10      
##  Mean   :148.7   Mean   : 24.56   Mean   : 3.271   Mean   : 1001.91      
##  3rd Qu.:216.5   3rd Qu.: 18.00   3rd Qu.: 6.700   3rd Qu.:  703.21      
##  Max.   :522.0   Max.   :957.00   Max.   :15.190   Max.   :19479.91      
##                                   NA's   :1                              
##   Hepatitis_B       Measles           BMI        under-five_deaths
##  Min.   : 2.00   Min.   :    0   Min.   : 2.00   Min.   :   0.00  
##  1st Qu.:79.00   1st Qu.:    0   1st Qu.:23.20   1st Qu.:   0.00  
##  Median :93.00   Median :   13   Median :47.40   Median :   3.00  
##  Mean   :83.12   Mean   : 1831   Mean   :41.03   Mean   :  32.89  
##  3rd Qu.:97.00   3rd Qu.:  316   3rd Qu.:59.80   3rd Qu.:  22.00  
##  Max.   :99.00   Max.   :79563   Max.   :77.10   Max.   :1200.00  
##  NA's   :10                      NA's   :2                        
##      Polio       Total_expenditure   Diphtheria       HIV/AIDS    
##  Min.   : 8.00   Min.   : 1.210    Min.   : 2.00   Min.   :0.100  
##  1st Qu.:80.00   1st Qu.: 4.480    1st Qu.:83.00   1st Qu.:0.100  
##  Median :94.00   Median : 5.840    Median :94.00   Median :0.100  
##  Mean   :84.73   Mean   : 6.201    Mean   :84.08   Mean   :0.682  
##  3rd Qu.:97.00   3rd Qu.: 7.740    3rd Qu.:97.00   3rd Qu.:0.400  
##  Max.   :99.00   Max.   :17.140    Max.   :99.00   Max.   :9.400  
##                  NA's   :2                                        
##       GDP              Population        thinness__1-19_years
##  Min.   :    12.28   Min.   :4.100e+01   Min.   : 0.100      
##  1st Qu.:   617.99   1st Qu.:2.869e+05   1st Qu.: 1.500      
##  Median :  3154.51   Median :1.568e+06   Median : 3.300      
##  Mean   : 10015.57   Mean   :2.106e+07   Mean   : 4.533      
##  3rd Qu.:  8239.95   3rd Qu.:8.080e+06   3rd Qu.: 6.600      
##  Max.   :119172.74   Max.   :1.294e+09   Max.   :26.800      
##  NA's   :28          NA's   :41          NA's   :2           
##  thinness_5-9_years Income_composition_of_resources   Schooling    
##  Min.   : 0.100     Min.   :0.3450                  Min.   : 4.90  
##  1st Qu.: 1.500     1st Qu.:0.5700                  1st Qu.:10.80  
##  Median : 3.400     Median :0.7220                  Median :13.00  
##  Mean   : 4.676     Mean   :0.6884                  Mean   :12.89  
##  3rd Qu.: 6.600     3rd Qu.:0.7960                  3rd Qu.:14.90  
##  Max.   :27.400     Max.   :0.9450                  Max.   :20.40  
##  NA's   :2          NA's   :10                      NA's   :10
gg_miss_var(life_df)

life_df %>%
  filter(
    Status == "Developed"
  ) %>%
  ggplot() +
  geom_col(
    aes(
      x = reorder(Country, Life_expectancy),
      y = Life_expectancy,
      fill = Status
    )
  ) +
  coord_flip() +
  theme(
    axis.text.y = element_text(size=6),
  )

life_df %>%
  filter(
    Status == "Developing"
  ) %>%
  ggplot() +
  geom_col(
    aes(
      x = reorder(Country, Life_expectancy),
      y = Life_expectancy,
      fill = Status
    )
  ) +
  coord_flip() +
  theme(
    axis.text.y = element_text(size=2),
  )

life <- life_df %>% 
  drop_na()
summary(life)
##    Country               Year         Status          Life_expectancy
##  Length:131         Min.   :2014   Length:131         Min.   :48.10  
##  Class :character   1st Qu.:2014   Class :character   1st Qu.:64.65  
##  Mode  :character   Median :2014   Mode  :character   Median :72.00  
##                     Mean   :2014                      Mean   :70.52  
##                     3rd Qu.:2014                      3rd Qu.:75.80  
##                     Max.   :2014                      Max.   :89.00  
##  Adult_Mortality infant_deaths       Alcohol       percentage_expenditure
##  Min.   :  2.0   Min.   :  0.00   Min.   : 0.010   Min.   :    0.443     
##  1st Qu.: 74.5   1st Qu.:  0.00   1st Qu.: 0.010   1st Qu.:   48.311     
##  Median :144.0   Median :  3.00   Median : 0.010   Median :  198.734     
##  Mean   :160.4   Mean   : 28.56   Mean   : 3.061   Mean   :  850.874     
##  3rd Qu.:225.0   3rd Qu.: 20.00   3rd Qu.: 6.305   3rd Qu.:  718.324     
##  Max.   :522.0   Max.   :957.00   Max.   :15.190   Max.   :16255.162     
##   Hepatitis_B       Measles             BMI        under-five_deaths
##  Min.   : 2.00   Min.   :    0.0   Min.   : 2.00   Min.   :   0.00  
##  1st Qu.:78.00   1st Qu.:    0.0   1st Qu.:22.85   1st Qu.:   1.00  
##  Median :91.00   Median :   10.0   Median :45.90   Median :   3.00  
##  Mean   :81.71   Mean   : 2042.9   Mean   :40.48   Mean   :  38.24  
##  3rd Qu.:96.00   3rd Qu.:  289.5   3rd Qu.:59.45   3rd Qu.:  25.50  
##  Max.   :99.00   Max.   :79563.0   Max.   :77.10   Max.   :1200.00  
##      Polio      Total_expenditure   Diphtheria       HIV/AIDS     
##  Min.   : 8.0   Min.   : 1.210    Min.   : 2.00   Min.   :0.1000  
##  1st Qu.:78.0   1st Qu.: 4.485    1st Qu.:80.00   1st Qu.:0.1000  
##  Median :92.0   Median : 5.820    Median :92.00   Median :0.1000  
##  Mean   :83.5   Mean   : 6.107    Mean   :83.89   Mean   :0.8099  
##  3rd Qu.:97.0   3rd Qu.: 7.630    3rd Qu.:97.00   3rd Qu.:0.5000  
##  Max.   :99.0   Max.   :13.730    Max.   :99.00   Max.   :9.4000  
##       GDP              Population        thinness__1-19_years
##  Min.   :    12.28   Min.   :4.100e+01   Min.   : 0.100      
##  1st Qu.:   554.92   1st Qu.:2.876e+05   1st Qu.: 1.500      
##  Median :  2522.80   Median :1.563e+06   Median : 3.300      
##  Mean   :  7256.85   Mean   :2.227e+07   Mean   : 4.648      
##  3rd Qu.:  7438.05   3rd Qu.:8.059e+06   3rd Qu.: 6.650      
##  Max.   :119172.74   Max.   :1.294e+09   Max.   :26.800      
##  thinness_5-9_years Income_composition_of_resources   Schooling    
##  Min.   : 0.100     Min.   :0.3450                  Min.   : 5.30  
##  1st Qu.: 1.550     1st Qu.:0.5440                  1st Qu.:10.75  
##  Median : 3.500     Median :0.6970                  Median :12.70  
##  Mean   : 4.886     Mean   :0.6697                  Mean   :12.68  
##  3rd Qu.: 6.800     3rd Qu.:0.7790                  3rd Qu.:14.70  
##  Max.   :27.400     Max.   :0.9360                  Max.   :20.40
range(life$Life_expectancy)
## [1] 48.1 89.0

22 Variables, 20 of them are Numerical, and 2 of them are Categorical. Variables we should drop: Country, Year Hepatitis.B has the Min Value and 1st Quartile difference of 76 which is too high (Factor?) Polio has the Min value and 1st QUartile difference of 70 which is too high (Factor?) Diphtheria has the min value and 1st quartile difference of 78 which is too high (Factor?)

According to the World Health Organization in 2018 they said that 86% of children in the world are receiving immunizations protecting them from these diseases. Source: https://www.chop.edu/centers-programs/vaccine-education-center/global-immunization/diseases-and-vaccines-world-view Let us use 86% as the benchmark to turn these columns into factors.

life_new <- life %>% 
  select(-Country, -Year) %>% 
  mutate(Hepatitis_B = ifelse(Hepatitis_B < 86, "<86% Immunized", ">=86% Immunized"),
         Polio = ifelse(Polio < 86, "<86% Immunized", ">=86% Immunized"),
         Diphtheria = ifelse(Diphtheria < 86, "<86% Immunized", ">=86% Immunized"),
         Hepatitis_B = as.factor(Hepatitis_B),
         Polio = as.factor(Polio),
         Diphtheria = as.factor(Diphtheria))
str(life_new)
## tibble [131 x 20] (S3: tbl_df/tbl/data.frame)
##  $ Status                         : chr [1:131] "Developing" "Developing" "Developing" "Developing" ...
##  $ Life_expectancy                : num [1:131] 59.9 77.5 75.4 51.7 76.2 74.6 82.7 81.4 72.5 71.4 ...
##  $ Adult_Mortality                : num [1:131] 271 8 11 348 118 12 6 66 119 132 ...
##  $ infant_deaths                  : num [1:131] 64 0 21 67 8 1 1 0 5 98 ...
##  $ Alcohol                        : num [1:131] 0.01 4.51 0.01 8.33 7.93 ...
##  $ percentage_expenditure         : num [1:131] 73.5 428.7 54.2 24 847.4 ...
##  $ Hepatitis_B                    : Factor w/ 2 levels "<86% Immunized",..: 1 2 2 1 2 2 2 2 2 2 ...
##  $ Measles                        : num [1:131] 492 0 0 11699 1 ...
##  $ BMI                            : num [1:131] 18.6 57.2 58.4 22.7 62.2 54.1 66.1 57.1 51.5 17.7 ...
##  $ under-five_deaths              : num [1:131] 86 1 24 101 9 1 1 0 6 121 ...
##  $ Polio                          : Factor w/ 2 levels "<86% Immunized",..: 1 2 2 1 2 2 2 2 2 2 ...
##  $ Total_expenditure              : num [1:131] 8.18 5.88 7.21 3.31 4.79 ...
##  $ Diphtheria                     : Factor w/ 2 levels "<86% Immunized",..: 1 2 2 1 2 2 2 2 2 2 ...
##  $ HIV/AIDS                       : num [1:131] 0.1 0.1 0.1 2 0.1 0.1 0.1 0.1 0.1 0.1 ...
##  $ GDP                            : num [1:131] 613 4576 548 479 12245 ...
##  $ Population                     : num [1:131] 327582 288914 39113313 2692466 42981515 ...
##  $ thinness__1-19_years           : num [1:131] 17.5 1.2 6 8.5 1 2.1 0.6 1.8 2.8 18.1 ...
##  $ thinness_5-9_years             : num [1:131] 17.5 1.3 5.8 8.3 0.9 2.1 0.6 2 2.9 18.6 ...
##  $ Income_composition_of_resources: num [1:131] 0.476 0.761 0.741 0.527 0.825 0.739 0.936 0.892 0.752 0.57 ...
##  $ Schooling                      : num [1:131] 10 14.2 14.4 11.4 17.3 12.7 20.4 15.9 12.2 10 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Country = col_character(),
##   ..   Year = col_double(),
##   ..   Status = col_character(),
##   ..   `Life expectancy` = col_double(),
##   ..   `Adult Mortality` = col_double(),
##   ..   `infant deaths` = col_double(),
##   ..   Alcohol = col_double(),
##   ..   `percentage expenditure` = col_double(),
##   ..   `Hepatitis B` = col_double(),
##   ..   Measles = col_double(),
##   ..   BMI = col_double(),
##   ..   `under-five deaths` = col_double(),
##   ..   Polio = col_double(),
##   ..   `Total expenditure` = col_double(),
##   ..   Diphtheria = col_double(),
##   ..   `HIV/AIDS` = col_double(),
##   ..   GDP = col_double(),
##   ..   Population = col_double(),
##   ..   `thinness  1-19 years` = col_double(),
##   ..   `thinness 5-9 years` = col_double(),
##   ..   `Income composition of resources` = col_double(),
##   ..   Schooling = col_double()
##   .. )

Check the correlation of the numerical variables

life_numerical <- life_new %>% 
  select_if(is.numeric)
ggcorr(life_numerical, 
       label = T, 
       label_size = 2,
       label_round = 2,
       hjust = 1,
       size = 3, 
       color = "black",
       layout.exp = 5,
       low = "forestgreen", 
       mid = "gray95", 
       high = "darkorange",
       name = "Correlation")

Life_expectancy has a strong negative correlation with Adult_Mortality which is understandable since if mortality rate in adults is high, Life exepctancy would be lower. Let us compare our Correlation Matrix with a VIF model to see which columns we should drop.

life_df_fctr2 <- life_new  #removing the country name, indicator vars
life_fctr2model <- lm(Life_expectancy~.,data=life_df_fctr2)  # . means all variable not mpg
vif(life_fctr2model)
##                          Status                 Adult_Mortality 
##                        1.690003                        2.671429 
##                   infant_deaths                         Alcohol 
##                      404.004911                        2.064448 
##          percentage_expenditure                     Hepatitis_B 
##                       11.950842                        7.194311 
##                         Measles                             BMI 
##                        2.867046                        2.193348 
##             `under-five_deaths`                           Polio 
##                      324.203388                        5.264166 
##               Total_expenditure                      Diphtheria 
##                        1.309070                       11.696480 
##                      `HIV/AIDS`                             GDP 
##                        1.880140                       12.482209 
##                      Population          `thinness__1-19_years` 
##                        8.013231                       12.991545 
##            `thinness_5-9_years` Income_composition_of_resources 
##                       13.140587                       11.347212 
##                       Schooling 
##                        7.468005

Life Expenctancy has a strong positive correlation with Schooling and Income_composition_of_resources but Schooling and Income_composition_of_resources have a high correlation with one another. Income_composition_of_resources has a VIF of 11.35 (vs 7.47 for Schooling) so let’s drop that.

Infant_deaths and under.five_deaths have a correlation of 1 (100%) which would tell us that there is multicollinearity between them. Both have extremely high VIFs, so we should drop one of the variables. and I believe under 5 deaths should be dropped since it covers more ages and includes infant deaths within it.

Percentage_expenditure vs GDP is also highly correlated with one another with both values having high VIFs. GDP is a more well known datapoint and has a higher correlation with Life_expectancy, so let’s remove percentage_expenditure.

thinness_5-9_years has a strong correlation with thinness_1-19_years and both have high VIFs. Let’s drop thinness_5-9_years since thinness_1-19_years should have years 5-9 within it as well.

#removes Infant_deaths, Income_composition_of_resources, thinness_5-9_years, and percentage_expenditure.
life_new<- life_new[-c(6,10,18:19)]

life_df_fctr2 <- life_new 
life_fctr2model <- lm(Life_expectancy~.,data=life_df_fctr2)  
vif(life_fctr2model)
##                 Status        Adult_Mortality          infant_deaths 
##               1.676668               2.326290               4.614388 
##                Alcohol            Hepatitis_B                Measles 
##               1.878859               6.867340               2.233165 
##                    BMI                  Polio      Total_expenditure 
##               1.835098               5.164795               1.176686 
##             Diphtheria             `HIV/AIDS`                    GDP 
##              11.267731               1.842168               1.479840 
##             Population `thinness__1-19_years`              Schooling 
##               4.474000               2.234582               3.327405

The VIFs now look good for the numerical variables.

Including Plots

##Check the distribution of the categorical variables

## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 3
##   Status     count percentage
##   <chr>      <int> <chr>     
## 1 Developed     19 14.5%     
## 2 Developing   112 85.5%

According to the boxplots, the distribution of higher life_expectancy resides in the developed countries, with the plot even showing that the median for developed countries (2nd Quartile) beind skewed higher towards the 3rd quartile

We want to know if there is any significant difference between the average life expectancy in Developed and Developing countries.

summary(aov(Life_expectancy ~ Status, data = life_new))
##              Df Sum Sq Mean Sq F value   Pr(>F)    
## Status        1   2453  2453.1   44.12 7.83e-10 ***
## Residuals   129   7173    55.6                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The ANOVA test tells us that there is a significant difference between the life expectancy of Developed countries and the life expectancy of Developing countries.

## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 3
##   Hepatitis_B     count percentage
##   <fct>           <int> <chr>     
## 1 <86% Immunized     44 33.59%    
## 2 >=86% Immunized    87 66.41%

More than 1/3 of countries have less than 86% immunized for Hepatitis B. The Life Expectancy of the countries with greater than or equal to 86% Immunized is higher than the countries which have less that 86% Immunized. Note that the median (2nd Quartile) for >=86% immunized is skewed upwards towards the 3rd quartile.

We want to know if there is any significant difference between the average life expectancy in Developed and Developing countries.

summary(aov(Life_expectancy ~ Hepatitis_B, data = life_new))
##              Df Sum Sq Mean Sq F value   Pr(>F)    
## Hepatitis_B   1   1088  1087.6   16.43 8.67e-05 ***
## Residuals   129   8539    66.2                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The ANOVA test tells us that there is a significant difference between the life expectancy of countries that immmunized >=86% and the life expectancy of countries that immunized less than 86% for Hep B.

## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 3
##   Polio           count percentage
##   <fct>           <int> <chr>     
## 1 <86% Immunized     46 35.11%    
## 2 >=86% Immunized    85 64.89%

For Polio Immunizations, higher life expectancy resides with the countries that have >=86% Immunized. Note that the median (2nd Quartile) for >=86% immunized is skewed upwards towards the 3rd quartile.

summary(aov(Life_expectancy ~ Polio, data = life_new))
##              Df Sum Sq Mean Sq F value  Pr(>F)    
## Polio         1   2295  2294.8   40.38 3.3e-09 ***
## Residuals   129   7332    56.8                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

According to the ANOVA test, there is a significant difference between the life expectancy of countries that immmunized >=86% and the life expectancy of countries that immunized less than 86% for Polio.

## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 3
##   Diphtheria      count percentage
##   <fct>           <int> <chr>     
## 1 <86% Immunized     39 29.77%    
## 2 >=86% Immunized    92 70.23%

For Diphtheria Immunizations, higher life expectancy resides with the countries that have >=86% Immunized. Note that the median (2nd Quartile) for >=86% immunized is skewed upwards towards the 3rd quartile.

summary(aov(Life_expectancy ~ Diphtheria, data = life_new))
##              Df Sum Sq Mean Sq F value   Pr(>F)    
## Diphtheria    1   1934  1934.3   32.44 7.91e-08 ***
## Residuals   129   7692    59.6                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

According to the ANOVA test, there is a significant difference between the life expectancy of countries that immmunized >=86% and the life expectancy of countries that immunized less than 86% for Diphtheria.

life_plot5 <-  ggplot(life_new) +
            geom_mosaic(aes(x = product(Status), fill=Hepatitis_B)) +
            labs(x = NULL, y = NULL) +
            scale_fill_manual(values=c("forestgreen", "cadetblue")) +
            labs(x = "Status", y = "Hepatitis_B Immunizations") +
            theme(legend.position = "none")

ggplotly(life_plot5) 
chisq.test(table(life_new$Status, life_new$Hepatitis_B))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(life_new$Status, life_new$Hepatitis_B)
## X-squared = 4.1582, df = 1, p-value = 0.04143

A larger percentage of Developed countries have >=86% of their people immunized for Hepatitis_B. The Chi-Squared test tells us that Developed and Developing Countries have different coverage of Hepatitis_B immunizations.

life_plot6 <-  ggplot(life_new) +
            geom_mosaic(aes(x = product(Status), fill=Polio)) +
            labs(x = NULL, y = NULL) +
            scale_fill_manual(values=c("forestgreen", "cadetblue")) +
            labs(x = "Status", y = "Polio Immunizations") +
            theme(legend.position = "none")

ggplotly(life_plot6) 
chisq.test(table(life_new$Status, life_new$Polio))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(life_new$Status, life_new$Polio)
## X-squared = 10.292, df = 1, p-value = 0.001336

ALL of the Developed countries have >=86% of their people immunized for Polio. The Chi-Squared test tells us that Developed and Developing Countries have different coverage of Polio immunizations.

life_plot7 <-  ggplot(life_new) +
            geom_mosaic(aes(x = product(Status), fill=Diphtheria)) +
            labs(x = NULL, y = NULL) +
            scale_fill_manual(values=c("forestgreen", "cadetblue")) +
            labs(x = "Status", y = "Diphtheria Immunizations") +
            theme(legend.position = "none")

ggplotly(life_plot7) 
chisq.test(table(life_new$Status, life_new$Diphtheria))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(life_new$Status, life_new$Diphtheria)
## X-squared = 7.8288, df = 1, p-value = 0.005142

ALL of the Developed countries have >=86% of their people immunized for Diphtheria The Chi-Squared test tells us that Developed and Developing Countries have different coverage of Diphtheria immunizations.

These numbers are very similar to Polio numbers, there could be evidence that Polio and Diphteria immunizations are done at the same time therefore one of them could be dropped. Let us see in the next section.

life_lm <- lm(formula = Life_expectancy ~., data = life_new)
summary(life_lm)
## 
## Call:
## lm(formula = Life_expectancy ~ ., data = life_new)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.2729  -1.7178   0.1095   2.0201   9.3597 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 6.071e+01  3.260e+00  18.624  < 2e-16 ***
## StatusDeveloping           -1.507e+00  1.155e+00  -1.305  0.19454    
## Adult_Mortality            -2.608e-02  4.366e-03  -5.974 2.65e-08 ***
## infant_deaths              -3.499e-03  6.767e-03  -0.517  0.60610    
## Alcohol                     1.174e-01  1.057e-01   1.111  0.26903    
## Hepatitis_B>=86% Immunized -2.035e+00  1.743e+00  -1.168  0.24541    
## Measles                     1.618e-05  4.787e-05   0.338  0.73596    
## BMI                         3.225e-02  2.060e-02   1.565  0.12026    
## Polio>=86% Immunized       -7.357e-01  1.495e+00  -0.492  0.62372    
## Total_expenditure           2.527e-01  1.350e-01   1.871  0.06385 .  
## Diphtheria>=86% Immunized   4.263e+00  2.306e+00   1.849  0.06704 .  
## `HIV/AIDS`                 -1.049e+00  2.740e-01  -3.829  0.00021 ***
## GDP                         4.030e-05  2.602e-05   1.549  0.12418    
## Population                  4.617e-09  5.715e-09   0.808  0.42086    
## `thinness__1-19_years`     -1.017e-01  1.066e-01  -0.953  0.34236    
## Schooling                   9.389e-01  2.091e-01   4.489 1.71e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.595 on 115 degrees of freedom
## Multiple R-squared:  0.8456, Adjusted R-squared:  0.8255 
## F-statistic: 41.99 on 15 and 115 DF,  p-value: < 2.2e-16

Coefficients that are negative: StatusDeveloping, Adult_Mortality, infant_deaths, Hepatitis_B>=86% Immunized, Polio>=86% Immunized, HIV/AIDS, and thinness__1-19_years. This tells us that everytime we add one of these variables we expect Life_expectancy to decrease. Diphtheria>=86% Immunized is expected to raise Life_expectancy while Hepatitis_B>=86% Immunized and Polio>=86% Immunized are exoected to decrease Life_expectancy. Adult_Mortality, HIV/AIDS, and Schooling are seen as the best predictors of Life_expectancy according to the model.

According to the Adj. R-Squared, 82.55% of the variation in Life_expectancy can be determined by the model’s inputed values. This is a very good value for the model.

life_step <- lm(formula = Life_expectancy ~., data = life_new)
life_step0 <- lm(formula = Life_expectancy ~1, data = life_new)
life_backward <- step(life_step, direction = "backward")
## Start:  AIC=350.18
## Life_expectancy ~ Status + Adult_Mortality + infant_deaths + 
##     Alcohol + Hepatitis_B + Measles + BMI + Polio + Total_expenditure + 
##     Diphtheria + `HIV/AIDS` + GDP + Population + `thinness__1-19_years` + 
##     Schooling
## 
##                          Df Sum of Sq    RSS    AIC
## - Measles                 1      1.48 1487.8 348.31
## - Polio                   1      3.13 1489.5 348.46
## - infant_deaths           1      3.46 1489.8 348.49
## - Population              1      8.43 1494.8 348.92
## - `thinness__1-19_years`  1     11.75 1498.1 349.21
## - Alcohol                 1     15.94 1502.3 349.58
## - Hepatitis_B             1     17.62 1504.0 349.73
## - Status                  1     22.01 1508.4 350.11
## <none>                                1486.3 350.18
## - GDP                     1     31.00 1517.3 350.89
## - BMI                     1     31.67 1518.0 350.95
## - Diphtheria              1     44.18 1530.5 352.02
## - Total_expenditure       1     45.26 1531.6 352.11
## - `HIV/AIDS`              1    189.47 1675.8 363.90
## - Schooling               1    260.51 1746.9 369.34
## - Adult_Mortality         1    461.20 1947.5 383.59
## 
## Step:  AIC=348.31
## Life_expectancy ~ Status + Adult_Mortality + infant_deaths + 
##     Alcohol + Hepatitis_B + BMI + Polio + Total_expenditure + 
##     Diphtheria + `HIV/AIDS` + GDP + Population + `thinness__1-19_years` + 
##     Schooling
## 
##                          Df Sum of Sq    RSS    AIC
## - infant_deaths           1      2.43 1490.2 346.53
## - Polio                   1      3.13 1491.0 346.59
## - Population              1     10.98 1498.8 347.28
## - `thinness__1-19_years`  1     13.58 1501.4 347.50
## - Hepatitis_B             1     17.16 1505.0 347.82
## - Alcohol                 1     17.22 1505.0 347.82
## - Status                  1     21.26 1509.1 348.17
## <none>                                1487.8 348.31
## - BMI                     1     30.35 1518.2 348.96
## - GDP                     1     30.38 1518.2 348.96
## - Diphtheria              1     43.59 1531.4 350.10
## - Total_expenditure       1     44.71 1532.5 350.19
## - `HIV/AIDS`              1    195.04 1682.9 362.45
## - Schooling               1    261.98 1749.8 367.56
## - Adult_Mortality         1    460.12 1948.0 381.61
## 
## Step:  AIC=346.53
## Life_expectancy ~ Status + Adult_Mortality + Alcohol + Hepatitis_B + 
##     BMI + Polio + Total_expenditure + Diphtheria + `HIV/AIDS` + 
##     GDP + Population + `thinness__1-19_years` + Schooling
## 
##                          Df Sum of Sq    RSS    AIC
## - Polio                   1      3.40 1493.7 344.83
## - Population              1     12.10 1502.4 345.59
## - `thinness__1-19_years`  1     14.44 1504.7 345.79
## - Hepatitis_B             1     17.05 1507.3 346.02
## - Alcohol                 1     17.62 1507.9 346.07
## - Status                  1     20.18 1510.4 346.29
## <none>                                1490.2 346.53
## - GDP                     1     30.29 1520.5 347.16
## - BMI                     1     30.80 1521.0 347.21
## - Diphtheria              1     45.10 1535.3 348.43
## - Total_expenditure       1     46.63 1536.9 348.56
## - `HIV/AIDS`              1    198.27 1688.5 360.89
## - Schooling               1    268.21 1758.5 366.21
## - Adult_Mortality         1    464.59 1954.8 380.08
## 
## Step:  AIC=344.83
## Life_expectancy ~ Status + Adult_Mortality + Alcohol + Hepatitis_B + 
##     BMI + Total_expenditure + Diphtheria + `HIV/AIDS` + GDP + 
##     Population + `thinness__1-19_years` + Schooling
## 
##                          Df Sum of Sq    RSS    AIC
## - Population              1     12.68 1506.3 343.93
## - `thinness__1-19_years`  1     15.26 1508.9 344.16
## - Alcohol                 1     16.31 1510.0 344.25
## - Hepatitis_B             1     16.77 1510.4 344.29
## - Status                  1     19.58 1513.2 344.53
## <none>                                1493.7 344.83
## - GDP                     1     30.39 1524.0 345.46
## - BMI                     1     31.78 1525.4 345.58
## - Total_expenditure       1     44.86 1538.5 346.70
## - Diphtheria              1     47.92 1541.6 346.96
## - `HIV/AIDS`              1    195.02 1688.7 358.90
## - Schooling               1    268.41 1762.1 364.47
## - Adult_Mortality         1    461.76 1955.4 378.11
## 
## Step:  AIC=343.93
## Life_expectancy ~ Status + Adult_Mortality + Alcohol + Hepatitis_B + 
##     BMI + Total_expenditure + Diphtheria + `HIV/AIDS` + GDP + 
##     `thinness__1-19_years` + Schooling
## 
##                          Df Sum of Sq    RSS    AIC
## - `thinness__1-19_years`  1      5.83 1512.2 342.44
## - Status                  1     18.44 1524.8 343.53
## - Hepatitis_B             1     18.48 1524.8 343.53
## - Alcohol                 1     19.84 1526.2 343.65
## <none>                                1506.3 343.93
## - GDP                     1     30.62 1537.0 344.57
## - BMI                     1     34.43 1540.8 344.89
## - Total_expenditure       1     43.85 1550.2 345.69
## - Diphtheria              1     44.88 1551.2 345.78
## - `HIV/AIDS`              1    198.24 1704.6 358.13
## - Schooling               1    302.29 1808.6 365.89
## - Adult_Mortality         1    459.48 1965.8 376.81
## 
## Step:  AIC=342.44
## Life_expectancy ~ Status + Adult_Mortality + Alcohol + Hepatitis_B + 
##     BMI + Total_expenditure + Diphtheria + `HIV/AIDS` + GDP + 
##     Schooling
## 
##                     Df Sum of Sq    RSS    AIC
## - Hepatitis_B        1     19.05 1531.2 342.08
## - Status             1     19.41 1531.6 342.11
## - Alcohol            1     20.94 1533.1 342.24
## <none>                           1512.2 342.44
## - GDP                1     32.62 1544.8 343.23
## - Diphtheria         1     44.24 1556.4 344.22
## - Total_expenditure  1     46.59 1558.8 344.41
## - BMI                1     52.90 1565.1 344.94
## - `HIV/AIDS`         1    193.78 1705.9 356.23
## - Schooling          1    326.72 1838.9 366.06
## - Adult_Mortality    1    459.47 1971.6 375.20
## 
## Step:  AIC=342.08
## Life_expectancy ~ Status + Adult_Mortality + Alcohol + BMI + 
##     Total_expenditure + Diphtheria + `HIV/AIDS` + GDP + Schooling
## 
##                     Df Sum of Sq    RSS    AIC
## - Status             1     20.58 1551.8 341.83
## <none>                           1531.2 342.08
## - GDP                1     28.71 1559.9 342.51
## - Alcohol            1     29.72 1560.9 342.60
## - Diphtheria         1     40.14 1571.4 343.47
## - Total_expenditure  1     53.10 1584.3 344.54
## - BMI                1     63.53 1594.7 345.40
## - `HIV/AIDS`         1    189.41 1720.6 355.36
## - Schooling          1    315.95 1847.2 364.65
## - Adult_Mortality    1    478.79 2010.0 375.72
## 
## Step:  AIC=341.83
## Life_expectancy ~ Adult_Mortality + Alcohol + BMI + Total_expenditure + 
##     Diphtheria + `HIV/AIDS` + GDP + Schooling
## 
##                     Df Sum of Sq    RSS    AIC
## <none>                           1551.8 341.83
## - Diphtheria         1     40.40 1592.2 343.19
## - Alcohol            1     41.76 1593.5 343.31
## - GDP                1     45.31 1597.1 343.60
## - BMI                1     60.72 1612.5 344.86
## - Total_expenditure  1     61.62 1613.4 344.93
## - `HIV/AIDS`         1    184.99 1736.8 354.58
## - Schooling          1    367.01 1918.8 367.64
## - Adult_Mortality    1    488.49 2040.3 375.68
summary(life_backward)
## 
## Call:
## lm(formula = Life_expectancy ~ Adult_Mortality + Alcohol + BMI + 
##     Total_expenditure + Diphtheria + `HIV/AIDS` + GDP + Schooling, 
##     data = life_new)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.4181  -1.7626   0.2275   1.9425  10.2316 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                5.706e+01  2.391e+00  23.868  < 2e-16 ***
## Adult_Mortality           -2.648e-02  4.273e-03  -6.197 8.08e-09 ***
## Alcohol                    1.799e-01  9.929e-02   1.812 0.072462 .  
## BMI                        4.049e-02  1.853e-02   2.185 0.030808 *  
## Total_expenditure          2.885e-01  1.311e-01   2.201 0.029621 *  
## Diphtheria>=86% Immunized  1.413e+00  7.930e-01   1.782 0.077201 .  
## `HIV/AIDS`                -1.018e+00  2.668e-01  -3.814 0.000216 ***
## GDP                        4.677e-05  2.478e-05   1.887 0.061491 .  
## Schooling                  1.045e+00  1.946e-01   5.372 3.80e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.566 on 122 degrees of freedom
## Multiple R-squared:  0.8388, Adjusted R-squared:  0.8282 
## F-statistic: 79.35 on 8 and 122 DF,  p-value: < 2.2e-16
life_forward <- step(life_step0, scope = list(lower = life_step0, upper = life_step) ,direction = "forward")
## Start:  AIC=564.92
## Life_expectancy ~ 1
## 
##                          Df Sum of Sq    RSS    AIC
## + Schooling               1    6188.7 3437.8 432.03
## + Adult_Mortality         1    5708.6 3917.9 449.15
## + `HIV/AIDS`              1    3666.0 5960.5 504.12
## + BMI                     1    2967.6 6658.9 518.63
## + Alcohol                 1    2699.1 6927.4 523.81
## + Status                  1    2453.1 7173.4 528.38
## + Polio                   1    2294.8 7331.7 531.24
## + Diphtheria              1    1934.3 7692.2 537.53
## + GDP                     1    1895.8 7730.7 538.19
## + `thinness__1-19_years`  1    1837.9 7788.6 539.16
## + Hepatitis_B             1    1087.6 8538.9 551.21
## + Total_expenditure       1     981.5 8645.0 552.83
## + infant_deaths           1     388.7 9237.8 561.52
## <none>                                9626.5 564.92
## + Measles                 1      22.9 9603.6 566.61
## + Population              1      12.5 9614.0 566.75
## 
## Step:  AIC=432.03
## Life_expectancy ~ Schooling
## 
##                          Df Sum of Sq    RSS    AIC
## + Adult_Mortality         1   1393.17 2044.7 365.96
## + `HIV/AIDS`              1   1071.75 2366.1 385.09
## + BMI                     1    167.16 3270.7 427.50
## + Polio                   1     91.70 3346.1 430.49
## + GDP                     1     83.87 3354.0 430.79
## + Status                  1     77.22 3360.6 431.05
## + Total_expenditure       1     63.03 3374.8 431.61
## + Alcohol                 1     55.28 3382.6 431.91
## <none>                                3437.8 432.03
## + Diphtheria              1     35.90 3401.9 432.65
## + `thinness__1-19_years`  1     28.55 3409.3 432.94
## + infant_deaths           1     10.11 3427.7 433.64
## + Hepatitis_B             1      1.14 3436.7 433.99
## + Population              1      0.38 3437.5 434.02
## + Measles                 1      0.02 3437.8 434.03
## 
## Step:  AIC=365.96
## Life_expectancy ~ Schooling + Adult_Mortality
## 
##                          Df Sum of Sq    RSS    AIC
## + `HIV/AIDS`              1   195.968 1848.7 354.76
## + Alcohol                 1   116.246 1928.4 360.29
## + Total_expenditure       1    90.566 1954.1 362.03
## + BMI                     1    72.276 1972.4 363.25
## + Status                  1    71.262 1973.4 363.31
## + GDP                     1    60.149 1984.5 364.05
## + `thinness__1-19_years`  1    36.137 2008.5 365.63
## <none>                                2044.7 365.96
## + Diphtheria              1    21.822 2022.8 366.56
## + Polio                   1    20.848 2023.8 366.62
## + infant_deaths           1     2.592 2042.1 367.79
## + Hepatitis_B             1     0.963 2043.7 367.90
## + Measles                 1     0.589 2044.1 367.92
## + Population              1     0.179 2044.5 367.95
## 
## Step:  AIC=354.76
## Life_expectancy ~ Schooling + Adult_Mortality + `HIV/AIDS`
## 
##                          Df Sum of Sq    RSS    AIC
## + Alcohol                 1    97.412 1751.3 349.67
## + Total_expenditure       1    90.216 1758.5 350.21
## + Status                  1    79.954 1768.8 350.97
## + GDP                     1    68.579 1780.1 351.81
## + BMI                     1    53.390 1795.3 352.92
## + `thinness__1-19_years`  1    43.450 1805.2 353.65
## + Diphtheria              1    37.407 1811.3 354.08
## <none>                                1848.7 354.76
## + Polio                   1    24.960 1823.7 354.98
## + infant_deaths           1     5.061 1843.6 356.40
## + Hepatitis_B             1     4.822 1843.9 356.42
## + Measles                 1     0.736 1848.0 356.71
## + Population              1     0.556 1848.2 356.72
## 
## Step:  AIC=349.67
## Life_expectancy ~ Schooling + Adult_Mortality + `HIV/AIDS` + 
##     Alcohol
## 
##                          Df Sum of Sq    RSS    AIC
## + Total_expenditure       1    70.273 1681.0 346.31
## + BMI                     1    44.275 1707.0 348.32
## + Status                  1    42.635 1708.7 348.44
## + Diphtheria              1    41.585 1709.7 348.52
## + GDP                     1    34.867 1716.4 349.04
## + `thinness__1-19_years`  1    29.835 1721.5 349.42
## <none>                                1751.3 349.67
## + Polio                   1    21.939 1729.3 350.02
## + Hepatitis_B             1    11.233 1740.1 350.83
## + infant_deaths           1     6.459 1744.8 351.19
## + Measles                 1     2.926 1748.4 351.45
## + Population              1     1.422 1749.9 351.56
## 
## Step:  AIC=346.31
## Life_expectancy ~ Schooling + Adult_Mortality + `HIV/AIDS` + 
##     Alcohol + Total_expenditure
## 
##                          Df Sum of Sq    RSS    AIC
## + BMI                     1    45.718 1635.3 344.69
## + GDP                     1    34.194 1646.8 345.61
## + Diphtheria              1    32.955 1648.1 345.71
## + Status                  1    31.648 1649.4 345.82
## <none>                                1681.0 346.31
## + `thinness__1-19_years`  1    24.237 1656.8 346.40
## + Polio                   1    13.696 1667.3 347.23
## + Hepatitis_B             1     9.307 1671.7 347.58
## + infant_deaths           1     2.881 1678.1 348.08
## + Measles                 1     0.812 1680.2 348.24
## + Population              1     0.330 1680.7 348.28
## 
## Step:  AIC=344.69
## Life_expectancy ~ Schooling + Adult_Mortality + `HIV/AIDS` + 
##     Alcohol + Total_expenditure + BMI
## 
##                          Df Sum of Sq    RSS    AIC
## + GDP                     1    43.107 1592.2 343.19
## + Diphtheria              1    38.201 1597.1 343.60
## + Status                  1    36.996 1598.3 343.70
## <none>                                1635.3 344.69
## + Polio                   1    18.619 1616.7 345.19
## + Hepatitis_B             1    15.186 1620.1 345.47
## + `thinness__1-19_years`  1     6.730 1628.6 346.15
## + infant_deaths           1     0.657 1634.6 346.64
## + Population              1     0.069 1635.2 346.69
## + Measles                 1     0.020 1635.3 346.69
## 
## Step:  AIC=343.19
## Life_expectancy ~ Schooling + Adult_Mortality + `HIV/AIDS` + 
##     Alcohol + Total_expenditure + BMI + GDP
## 
##                          Df Sum of Sq    RSS    AIC
## + Diphtheria              1    40.401 1551.8 341.83
## <none>                                1592.2 343.19
## + Status                  1    20.838 1571.4 343.47
## + Polio                   1    19.900 1572.3 343.55
## + Hepatitis_B             1    14.647 1577.5 343.98
## + `thinness__1-19_years`  1     4.249 1587.9 344.84
## + infant_deaths           1     0.438 1591.8 345.16
## + Measles                 1     0.307 1591.9 345.17
## + Population              1     0.296 1591.9 345.17
## 
## Step:  AIC=341.83
## Life_expectancy ~ Schooling + Adult_Mortality + `HIV/AIDS` + 
##     Alcohol + Total_expenditure + BMI + GDP + Diphtheria
## 
##                          Df Sum of Sq    RSS    AIC
## <none>                                1551.8 341.83
## + Status                  1   20.5808 1531.2 342.08
## + Hepatitis_B             1   20.2143 1531.6 342.11
## + `thinness__1-19_years`  1    7.4618 1544.3 343.20
## + Polio                   1    3.3691 1548.4 343.54
## + Measles                 1    3.0547 1548.7 343.57
## + Population              1    2.9088 1548.9 343.58
## + infant_deaths           1    0.4607 1551.3 343.79
summary(life_forward)
## 
## Call:
## lm(formula = Life_expectancy ~ Schooling + Adult_Mortality + 
##     `HIV/AIDS` + Alcohol + Total_expenditure + BMI + GDP + Diphtheria, 
##     data = life_new)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.4181  -1.7626   0.2275   1.9425  10.2316 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                5.706e+01  2.391e+00  23.868  < 2e-16 ***
## Schooling                  1.045e+00  1.946e-01   5.372 3.80e-07 ***
## Adult_Mortality           -2.648e-02  4.273e-03  -6.197 8.08e-09 ***
## `HIV/AIDS`                -1.018e+00  2.668e-01  -3.814 0.000216 ***
## Alcohol                    1.799e-01  9.929e-02   1.812 0.072462 .  
## Total_expenditure          2.885e-01  1.311e-01   2.201 0.029621 *  
## BMI                        4.049e-02  1.853e-02   2.185 0.030808 *  
## GDP                        4.677e-05  2.478e-05   1.887 0.061491 .  
## Diphtheria>=86% Immunized  1.413e+00  7.930e-01   1.782 0.077201 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.566 on 122 degrees of freedom
## Multiple R-squared:  0.8388, Adjusted R-squared:  0.8282 
## F-statistic: 79.35 on 8 and 122 DF,  p-value: < 2.2e-16
life_both <- step(life_step, scope = list(lower = life_step0, upper = life_step) ,direction = "both")
## Start:  AIC=350.18
## Life_expectancy ~ Status + Adult_Mortality + infant_deaths + 
##     Alcohol + Hepatitis_B + Measles + BMI + Polio + Total_expenditure + 
##     Diphtheria + `HIV/AIDS` + GDP + Population + `thinness__1-19_years` + 
##     Schooling
## 
##                          Df Sum of Sq    RSS    AIC
## - Measles                 1      1.48 1487.8 348.31
## - Polio                   1      3.13 1489.5 348.46
## - infant_deaths           1      3.46 1489.8 348.49
## - Population              1      8.43 1494.8 348.92
## - `thinness__1-19_years`  1     11.75 1498.1 349.21
## - Alcohol                 1     15.94 1502.3 349.58
## - Hepatitis_B             1     17.62 1504.0 349.73
## - Status                  1     22.01 1508.4 350.11
## <none>                                1486.3 350.18
## - GDP                     1     31.00 1517.3 350.89
## - BMI                     1     31.67 1518.0 350.95
## - Diphtheria              1     44.18 1530.5 352.02
## - Total_expenditure       1     45.26 1531.6 352.11
## - `HIV/AIDS`              1    189.47 1675.8 363.90
## - Schooling               1    260.51 1746.9 369.34
## - Adult_Mortality         1    461.20 1947.5 383.59
## 
## Step:  AIC=348.31
## Life_expectancy ~ Status + Adult_Mortality + infant_deaths + 
##     Alcohol + Hepatitis_B + BMI + Polio + Total_expenditure + 
##     Diphtheria + `HIV/AIDS` + GDP + Population + `thinness__1-19_years` + 
##     Schooling
## 
##                          Df Sum of Sq    RSS    AIC
## - infant_deaths           1      2.43 1490.2 346.53
## - Polio                   1      3.13 1491.0 346.59
## - Population              1     10.98 1498.8 347.28
## - `thinness__1-19_years`  1     13.58 1501.4 347.50
## - Hepatitis_B             1     17.16 1505.0 347.82
## - Alcohol                 1     17.22 1505.0 347.82
## - Status                  1     21.26 1509.1 348.17
## <none>                                1487.8 348.31
## - BMI                     1     30.35 1518.2 348.96
## - GDP                     1     30.38 1518.2 348.96
## - Diphtheria              1     43.59 1531.4 350.10
## + Measles                 1      1.48 1486.3 350.18
## - Total_expenditure       1     44.71 1532.5 350.19
## - `HIV/AIDS`              1    195.04 1682.9 362.45
## - Schooling               1    261.98 1749.8 367.56
## - Adult_Mortality         1    460.12 1948.0 381.61
## 
## Step:  AIC=346.53
## Life_expectancy ~ Status + Adult_Mortality + Alcohol + Hepatitis_B + 
##     BMI + Polio + Total_expenditure + Diphtheria + `HIV/AIDS` + 
##     GDP + Population + `thinness__1-19_years` + Schooling
## 
##                          Df Sum of Sq    RSS    AIC
## - Polio                   1      3.40 1493.7 344.83
## - Population              1     12.10 1502.4 345.59
## - `thinness__1-19_years`  1     14.44 1504.7 345.79
## - Hepatitis_B             1     17.05 1507.3 346.02
## - Alcohol                 1     17.62 1507.9 346.07
## - Status                  1     20.18 1510.4 346.29
## <none>                                1490.2 346.53
## - GDP                     1     30.29 1520.5 347.16
## - BMI                     1     30.80 1521.0 347.21
## + infant_deaths           1      2.43 1487.8 348.31
## - Diphtheria              1     45.10 1535.3 348.43
## + Measles                 1      0.45 1489.8 348.49
## - Total_expenditure       1     46.63 1536.9 348.56
## - `HIV/AIDS`              1    198.27 1688.5 360.89
## - Schooling               1    268.21 1758.5 366.21
## - Adult_Mortality         1    464.59 1954.8 380.08
## 
## Step:  AIC=344.83
## Life_expectancy ~ Status + Adult_Mortality + Alcohol + Hepatitis_B + 
##     BMI + Total_expenditure + Diphtheria + `HIV/AIDS` + GDP + 
##     Population + `thinness__1-19_years` + Schooling
## 
##                          Df Sum of Sq    RSS    AIC
## - Population              1     12.68 1506.3 343.93
## - `thinness__1-19_years`  1     15.26 1508.9 344.16
## - Alcohol                 1     16.31 1510.0 344.25
## - Hepatitis_B             1     16.77 1510.4 344.29
## - Status                  1     19.58 1513.2 344.53
## <none>                                1493.7 344.83
## - GDP                     1     30.39 1524.0 345.46
## - BMI                     1     31.78 1525.4 345.58
## + Polio                   1      3.40 1490.2 346.53
## + infant_deaths           1      2.69 1491.0 346.59
## - Total_expenditure       1     44.86 1538.5 346.70
## + Measles                 1      0.42 1493.2 346.79
## - Diphtheria              1     47.92 1541.6 346.96
## - `HIV/AIDS`              1    195.02 1688.7 358.90
## - Schooling               1    268.41 1762.1 364.47
## - Adult_Mortality         1    461.76 1955.4 378.11
## 
## Step:  AIC=343.93
## Life_expectancy ~ Status + Adult_Mortality + Alcohol + Hepatitis_B + 
##     BMI + Total_expenditure + Diphtheria + `HIV/AIDS` + GDP + 
##     `thinness__1-19_years` + Schooling
## 
##                          Df Sum of Sq    RSS    AIC
## - `thinness__1-19_years`  1      5.83 1512.2 342.44
## - Status                  1     18.44 1524.8 343.53
## - Hepatitis_B             1     18.48 1524.8 343.53
## - Alcohol                 1     19.84 1526.2 343.65
## <none>                                1506.3 343.93
## - GDP                     1     30.62 1537.0 344.57
## + Population              1     12.68 1493.7 344.83
## - BMI                     1     34.43 1540.8 344.89
## + Measles                 1      7.75 1498.6 345.26
## + Polio                   1      3.97 1502.4 345.59
## + infant_deaths           1      3.61 1502.7 345.62
## - Total_expenditure       1     43.85 1550.2 345.69
## - Diphtheria              1     44.88 1551.2 345.78
## - `HIV/AIDS`              1    198.24 1704.6 358.13
## - Schooling               1    302.29 1808.6 365.89
## - Adult_Mortality         1    459.48 1965.8 376.81
## 
## Step:  AIC=342.44
## Life_expectancy ~ Status + Adult_Mortality + Alcohol + Hepatitis_B + 
##     BMI + Total_expenditure + Diphtheria + `HIV/AIDS` + GDP + 
##     Schooling
## 
##                          Df Sum of Sq    RSS    AIC
## - Hepatitis_B             1     19.05 1531.2 342.08
## - Status                  1     19.41 1531.6 342.11
## - Alcohol                 1     20.94 1533.1 342.24
## <none>                                1512.2 342.44
## - GDP                     1     32.62 1544.8 343.23
## + `thinness__1-19_years`  1      5.83 1506.3 343.93
## + Measles                 1      4.34 1507.8 344.06
## + Polio                   1      4.34 1507.8 344.06
## + Population              1      3.26 1508.9 344.16
## - Diphtheria              1     44.24 1556.4 344.22
## + infant_deaths           1      0.29 1511.9 344.41
## - Total_expenditure       1     46.59 1558.8 344.41
## - BMI                     1     52.90 1565.1 344.94
## - `HIV/AIDS`              1    193.78 1705.9 356.23
## - Schooling               1    326.72 1838.9 366.06
## - Adult_Mortality         1    459.47 1971.6 375.20
## 
## Step:  AIC=342.08
## Life_expectancy ~ Status + Adult_Mortality + Alcohol + BMI + 
##     Total_expenditure + Diphtheria + `HIV/AIDS` + GDP + Schooling
## 
##                          Df Sum of Sq    RSS    AIC
## - Status                  1     20.58 1551.8 341.83
## <none>                                1531.2 342.08
## + Hepatitis_B             1     19.05 1512.2 342.44
## - GDP                     1     28.71 1559.9 342.51
## - Alcohol                 1     29.72 1560.9 342.60
## - Diphtheria              1     40.14 1571.4 343.47
## + `thinness__1-19_years`  1      6.40 1524.8 343.53
## + Measles                 1      4.26 1527.0 343.71
## + Polio                   1      4.06 1527.2 343.73
## + Population              1      3.81 1527.4 343.75
## + infant_deaths           1      0.46 1530.8 344.04
## - Total_expenditure       1     53.10 1584.3 344.54
## - BMI                     1     63.53 1594.7 345.40
## - `HIV/AIDS`              1    189.41 1720.6 355.36
## - Schooling               1    315.95 1847.2 364.65
## - Adult_Mortality         1    478.79 2010.0 375.72
## 
## Step:  AIC=341.83
## Life_expectancy ~ Adult_Mortality + Alcohol + BMI + Total_expenditure + 
##     Diphtheria + `HIV/AIDS` + GDP + Schooling
## 
##                          Df Sum of Sq    RSS    AIC
## <none>                                1551.8 341.83
## + Status                  1     20.58 1531.2 342.08
## + Hepatitis_B             1     20.21 1531.6 342.11
## - Diphtheria              1     40.40 1592.2 343.19
## + `thinness__1-19_years`  1      7.46 1544.3 343.20
## - Alcohol                 1     41.76 1593.5 343.31
## + Polio                   1      3.37 1548.4 343.54
## + Measles                 1      3.05 1548.7 343.57
## + Population              1      2.91 1548.9 343.58
## - GDP                     1     45.31 1597.1 343.60
## + infant_deaths           1      0.46 1551.3 343.79
## - BMI                     1     60.72 1612.5 344.86
## - Total_expenditure       1     61.62 1613.4 344.93
## - `HIV/AIDS`              1    184.99 1736.8 354.58
## - Schooling               1    367.01 1918.8 367.64
## - Adult_Mortality         1    488.49 2040.3 375.68
summary(life_both)
## 
## Call:
## lm(formula = Life_expectancy ~ Adult_Mortality + Alcohol + BMI + 
##     Total_expenditure + Diphtheria + `HIV/AIDS` + GDP + Schooling, 
##     data = life_new)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.4181  -1.7626   0.2275   1.9425  10.2316 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                5.706e+01  2.391e+00  23.868  < 2e-16 ***
## Adult_Mortality           -2.648e-02  4.273e-03  -6.197 8.08e-09 ***
## Alcohol                    1.799e-01  9.929e-02   1.812 0.072462 .  
## BMI                        4.049e-02  1.853e-02   2.185 0.030808 *  
## Total_expenditure          2.885e-01  1.311e-01   2.201 0.029621 *  
## Diphtheria>=86% Immunized  1.413e+00  7.930e-01   1.782 0.077201 .  
## `HIV/AIDS`                -1.018e+00  2.668e-01  -3.814 0.000216 ***
## GDP                        4.677e-05  2.478e-05   1.887 0.061491 .  
## Schooling                  1.045e+00  1.946e-01   5.372 3.80e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.566 on 122 degrees of freedom
## Multiple R-squared:  0.8388, Adjusted R-squared:  0.8282 
## F-statistic: 79.35 on 8 and 122 DF,  p-value: < 2.2e-16

According to each feature selection, the 3 variables that have the most siginificance with regards to Life_expectancy are Adult_Mortality, HIV/AIDS, and Schooling. They have the lowest p-values in each of the selection methods as well as chosen by the model with ’***’ next to them.

Let us create a model with just these variables.

life_highp <- lm(formula = Life_expectancy ~ Adult_Mortality + `HIV/AIDS` + Schooling, data = life_new)

summary(life_highp)
## 
## Call:
## lm(formula = Life_expectancy ~ Adult_Mortality + `HIV/AIDS` + 
##     Schooling, data = life_new)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.4261  -2.1214   0.2122   2.2014   9.4356 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     54.506514   2.298683  23.712  < 2e-16 ***
## Adult_Mortality -0.026817   0.004498  -5.962 2.31e-08 ***
## `HIV/AIDS`      -1.035604   0.282249  -3.669 0.000357 ***
## Schooling        1.668685   0.148224  11.258  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.815 on 127 degrees of freedom
## Multiple R-squared:  0.808,  Adjusted R-squared:  0.8034 
## F-statistic: 178.1 on 3 and 127 DF,  p-value: < 2.2e-16
data.frame(model = c("life_backward","life_forward","life_both", "life_highp"), 
           AdjRsquare = c(summary(life_backward)$adj.r.square,
                          summary(life_forward)$adj.r.square,
                          summary(life_both)$adj.r.square,
                          summary(life_highp)$adj.r.square))
##           model AdjRsquare
## 1 life_backward  0.8282293
## 2  life_forward  0.8282293
## 3     life_both  0.8282293
## 4    life_highp  0.8034202

Given the Adj R-Squared Value for forward, backward, and both direction models are the same, we can choose whichever model we want. We will select Backwards Selection Model.

life_pred <- predict(life_backward, life_new)

data.frame(Method = c("MSE","RMSE","MAE", "MAPE"), 
           Error = c(MSE(life_pred, life_new$Life_expectancy),
                          RMSE(life_pred, life_new$Life_expectancy),
                          MAE(life_pred, life_new$Life_expectancy),
                          MAPE(life_pred, life_new$Life_expectancy)))
##   Method      Error
## 1    MSE 11.8457385
## 2   RMSE  3.4417639
## 3    MAE  2.5772651
## 4   MAPE  0.0375666
 range(life_new$Life_expectancy)
## [1] 48.1 89.0

The error values are quite small compared to the range of values for Life_expectancy. This means we can expect the model’s predicted values to be close to the actual values.

Let us check assumptions of the Model to see if it passes:

hist(life_backward$residuals, breaks = 20)

The histogram looks fairly normally distributed with the higher frequency residuals more towards the center.

plot(life_backward, which = 2)

The QQ plot looks fairly linear as the the majority of the values fall along the line of best fit.

shapiro.test(life_backward$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  life_backward$residuals
## W = 0.97649, p-value = 0.02246

Our Shapiro-Wilk Test shows that we do not pass normality of residuals

plot(cooks.distance(life_backward))

Cook’s D also tells us we have outliers present in the data. We will try taking the Log of the numerical values within the model to see if this sovles our

plot(life_backward$residuals)